Here are our required libraries;
knitr::opts_chunk$set(echo = TRUE,warning=FALSE)
library(tidyverse)
library(readxl)
library(forcats)
library(rlang)
library(scales)
library(knitr)
library(lubridate)
library(RColorBrewer)
library(plotly)
library(stringr)
library(data.table)
library(kableExtra)
library(splitstackshape)
library(grid)
library(gridExtra)
library(dplyr)
We obtained energy plant cuts data in Turkey between 2012 and 2018 from the Energy Transparency Platform. Data contains 73311 rows and 7 variables.
Objectives of this project is as follows:
# Create a temporary file
tmp<-tempfile(fileext=".csv")
# Download file from repository to the temp file
download.file("https://github.com/MEF-BDA503/gpj18-first/blob/master/dataset_candidates/ArizaBakim-01012008-25112018.csv?raw=true",destfile=tmp)
# Reading data. Encoding as Latin-1 is crucial in order to see Turkish characters properly.
cuts<-read.csv(tmp, encoding= "Latin-1", sep = ";")
colnames(cuts) <-c("Plant.Name","UEVCB","Start.Date","End.Date","Established.Power","Power.atOutage","Reason")
# Checking what we have as data.
str(cuts)
## 'data.frame': 74036 obs. of 7 variables:
## $ Plant.Name : Factor w/ 891 levels ""," "," 2.Ünite kazan siklonunda kısmi curuflanma ve vortex silindirinde kısmi yamulmadan dolayı ünite servis harici edildi. ",..: 733 889 345 222 66 508 401 572 730 485 ...
## $ UEVCB : Factor w/ 748 levels ""," AES ENTEK ELEKTRİK ÜRETİMİ A.Ş. (KOCAELİ)",..: 635 744 297 186 14 411 342 473 625 389 ...
## $ Start.Date : Factor w/ 62953 levels "","1.1.2013 00:00",..: 16880 13994 4693 17488 39657 24102 24857 28100 46422 60030 ...
## $ End.Date : Factor w/ 42169 levels "","1.1.2013 16:59",..: 11246 9366 3031 11662 26567 16055 16567 18722 31153 40256 ...
## $ Established.Power: Factor w/ 595 levels "","0","0,06",..: 256 443 120 224 480 489 531 336 379 420 ...
## $ Power.atOutage : Factor w/ 837 levels "","-100","-110",..: 107 9 9 9 9 439 308 9 123 9 ...
## $ Reason : Factor w/ 31166 levels "","'F'DEĞİRMENİ ARIZASI",..: 4148 27233 12956 24146 15255 1195 11705 11670 4519 7164 ...
# Change type of time and date
cuts$Start.Date <- dmy_hm(cuts$Start.Date)
cuts$End.Date<- dmy_hm(cuts$End.Date)
# Turn power capacity variables into numeric
cuts$Established.Power <- str_replace(cuts$Established.Power,"[:punct:]","")
cuts$Power.atOutage <- str_replace(cuts$Power.atOutage,"[:punct:]","")
cuts$Established.Power <- str_replace(cuts$Established.Power,",","")
cuts$Power.atOutage <- str_replace(cuts$Power.atOutage,",","")
cuts$Established.Power <- as.factor(cuts$Established.Power)
cuts$Power.atOutage <- as.factor(cuts$Power.atOutage)
cuts$Established.Power <-as.numeric(levels(cuts$Established.Power))[cuts$Established.Power]
cuts$Power.atOutage <-as.numeric(levels(cuts$Power.atOutage))[cuts$Power.atOutage]
cuts[c("Established.Power","Power.atOutage")]<-cuts[c("Established.Power","Power.atOutage")]/100
# Removing data that contains NA's.
cuts <- na.omit(cuts)
#Drop UEVÇB column, we do not think it will be any help for analysis.
cuts <- subset(cuts, select=-UEVCB)
# Let's check the data for yearly total cuts.
yearly_cuts <- cuts %>% group_by(year=floor_date(Start.Date,"year")) %>% summarize(Start.Date=n())
yearly_cuts
## # A tibble: 7 x 2
## year Start.Date
## <dttm> <int>
## 1 2012-01-01 00:00:00 338
## 2 2013-01-01 00:00:00 4791
## 3 2014-01-01 00:00:00 10650
## 4 2015-01-01 00:00:00 12684
## 5 2016-01-01 00:00:00 12763
## 6 2017-01-01 00:00:00 10880
## 7 2018-01-01 00:00:00 21207
Look like there is an average of 10-12k cuts per year in the database with the exception of 2018, where the number of cuts seems to be much higher. We believe this is simply the result of transparency platform adding more data from plants or more plants all together to its database in 2018 to make it more thorough.
# Let's put the yearly cuts into a bar graph.
ggplot(data=yearly_cuts, aes(y=Start.Date, x=factor(year(year)), fill=factor(year(year))))+
geom_bar(stat="identity")+
labs(x="Year", y="Incident Count", title="Yearly Total Incidents")+
theme_light()+
scale_fill_brewer(palette="PuBuGn")+
theme(legend.position="none")
The database has mostly text based data. Initial overview of the database shows a lot of errors, some of which come from localization problems, others simply misspelled. In this part, we’ve attempted to make it more tidy by making plant names more distinct, cut reasons more clear.
#Adding süre variable as duration of the cuts as hours and capacity usage ratio at time of cut as capacity usage's ratio to total capacity.
cuts <- cuts %>% mutate(sure = difftime(End.Date,Start.Date,units="hours")) %>% mutate(kapasiteorani = Power.atOutage / Established.Power)
#cuts$kapasiteorani <- percent(cuts$kapasiteorani)
# Make all strings lower case so that can be cleaned easier.
cuts <- cuts %>% mutate_at(.vars=c("Plant.Name", "Reason"), funs(str_to_lower(.,locale="tr")))
#With new variables let's see our data.
kable(cuts[1:5,]) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Plant.Name | Start.Date | End.Date | Established.Power | Power.atOutage | Reason | sure | kapasiteorani |
|---|---|---|---|---|---|---|---|
| silopi tes | 2016-10-17 08:10:00 | 2016-10-17 10:59:00 | 2.70 | 1.2 | 3.ünite kazan boru patlağından dolayı servis harici | 2.816667 hours | 0.4444444 |
| zorlu enerji lüleburgaz santralı | 2016-06-15 15:56:00 | 2016-06-15 15:59:00 | 57.96 | 0.0 | türbin stoll detected trip | 0.050000 hours | 0.0000000 |
| enerjisa - hacınınoğlu | 2018-10-11 00:22:00 | 2018-10-11 05:59:00 | 142.28 | 0.0 | göl seviyesi | 5.616667 hours | 0.0000000 |
| cengiz 240mw samsun gaz yakıtlı kombine çevrim enerji santrali | 2016-03-17 19:01:00 | 2016-03-17 22:59:00 | 23.89 | 0.0 | soğutma suyu arızası | 3.966667 hours | 0.0000000 |
| acarsoy denizli doğalgaz santrali | 2014-11-27 08:44:00 | 2014-11-27 23:59:00 | 0.63 | 0.0 | gt senkron problemi | 15.250000 hours | 0.0000000 |
#Detect similiar names with Fuzzy Text matching
uniqueNames <- unique(cuts$Plant.Name)
name_distances <- list()
i <- 1
for (ind in uniqueNames){
name_distances[[i]] <- agrep(ind, uniqueNames, value=T)
i <- i+1
}
name_distances <- unique(Filter(function(x) {length(x) > 1}, name_distances))
#Fixing detected string issues.
cuts$Plant.Name <- cuts$Plant.Name %>%
str_replace_all("[İı]", "i") %>%
str_replace_all("enerj.sa", "enerjisa") %>%
str_replace_all("yeniköy ts", "yeniköy tes") %>%
str_replace_all("^ova elektrik", "gebze ova elektrik") %>%
str_replace_all("yatağan .*", "yatağan tes") %>%
str_replace_all("köklüce$", "köklüce hes") %>%
str_replace_all(".* entek", "entek") %>%
str_replace_all("kürtün-hes", "kürtün hes") %>%
str_replace_all("^rwe_turcas_guney", "denizli rwe_turcas_guney") %>%
str_replace_all("tekirdağ santrali.*", "modern enerji tekirdağ santrali") %>%
str_replace_all("karadağ$", "karadağ res") %>%
str_replace_all(".?menzelet( hes)?", "menzelet hes") %>%
str_replace_all("\\.", "") %>%
str_replace_all("hidro(\\s?elektrik santral[ıi]| e\\.?s)", " hes") %>%
str_replace_all("(termik santral[ıi]|\\sts\\s?)", " tes") %>%
str_replace_all("tunçbilektes", "tunçbilek tes") %>%
str_replace_all("d.*(k.*)?ç.*(s.*)?", "dgkç") %>%
str_replace_all("jeotermal (e.*s.*)", "jes")
cuts$Reason <- cuts$Reason %>%
str_replace_all("[İı]", "i") %>%
str_replace_all("t(.r|r.)b.n", "türbin") %>%
str_replace_all("ar.zas.?.?", "ariza") %>%
str_replace_all("so.utma", "sogutma") %>%
str_replace_all("(?<!\\d)\\.", "") %>%
str_replace_all(".n.te", "unite") %>%
str_replace_all("reg.lat.r", "regulator")
In order to categorise the reasons, we performed a word count Gerekçe column, laying out most encountered words.
gerekcewordcount <- cSplit(cuts, "Reason", sep = " ", direction = "long") %>%
group_by(Reason) %>%
summarise(Count = n())
arrange(gerekcewordcount,desc(Count))
## # A tibble: 12,910 x 2
## Reason Count
## <fct> <int>
## 1 ariza 37310
## 2 türbin 7271
## 3 kazan 5291
## 4 unite 4863
## 5 dolayi 4460
## 6 gaz 4189
## 7 sogutma 3439
## 8 suyu 3235
## 9 su 3154
## 10 sistemi 2874
## # ... with 12,900 more rows
Based on the plant name, we created a new column called Plant.Type which stores the type of the plant.
The abbreviations are as following: HES : Hydroelectricty Plant RES : Wind Energy Plant TES : Thermal Power Plant DGKÇ: Natural Gas Combined Cycle Power Plant JES : Geothermal Energy Plant BES : Biomass Energy Plant/Biogas Plant
#Categorising type of plants so we can do type based analysis later on.
cuts<- cuts %>%
mutate(Plant.Type=ifelse(grepl("hes", cuts$Plant.Name, ignore.case = T), "HES",
(ifelse(grepl(" res\\s?|rüzgar", cuts$Plant.Name, ignore.case = T), "RES",
(ifelse(grepl("( tes\\s?|termik santral|ithal kömür|bolu göynük|eskişehir endüstriyel|aliağa çakmaktepe|enerjisa tufanbeyli|ataer)", cuts$Plant.Name, ignore.case = T), "TES",
(ifelse(grepl("(d.*k.*ç.*(s.*)?|bosen|acarsoy denizli|akenerji|ambarli|m.*osb|kombine|paner|enerjisa (bandirma|kentsa)|kojsant|zorlu enerji|rwe_turcas|kojen|ugur enerji|isbirligi-enerji|gebze ova elektrik)", cuts$Plant.Name, ignore.case = T), "DGKÇ",
(ifelse(grepl("(jes|jeotermal)", cuts$Plant.Name, ignore.case =T), "JES",
(ifelse(grepl("biyokütle|biogaz", cuts$Plant.Name, ignore.case =T), "BES",
"Other"))))))))))))
cuts$Plant.Type <- as.factor(cuts$Plant.Type)
Let’s take a look at our final dataset.
str(cuts)
## 'data.frame': 73313 obs. of 9 variables:
## $ Plant.Name : chr "silopi tes" "zorlu enerji lüleburgaz santrali" "enerjisa - hacininoğlu" "cengiz 240mw samsun gaz yakitli kombine çevrim enerji santrali" ...
## $ Start.Date : POSIXct, format: "2016-10-17 08:10:00" "2016-06-15 15:56:00" ...
## $ End.Date : POSIXct, format: "2016-10-17 10:59:00" "2016-06-15 15:59:00" ...
## $ Established.Power: num 2.7 57.96 142.28 23.89 0.63 ...
## $ Power.atOutage : num 1.2 0 0 0 0 4.2 3 0 1.35 0 ...
## $ Reason : chr "3.unite kazan boru patlağindan dolayi servis harici" "türbin stoll detected trip" "göl seviyesi " "sogutma suyu ariza" ...
## $ sure : 'difftime' num 2.81666666666667 0.05 5.61666666666667 3.96666666666667 ...
## ..- attr(*, "units")= chr "hours"
## $ kapasiteorani : num 0.444 0 0 0 0 ...
## $ Plant.Type : Factor w/ 7 levels "BES","DGKÇ","HES",..: 7 2 5 2 2 7 2 2 7 1 ...
Here are the explanations for variables:
Santral İsmi : Name of the power plant. Olay başlangıç tarihi : Start date time of the cut. Olay bitiş tarihi : End date/time of the cut. İşletmedeki kurulu güç : Total power at the plant. Olay sırasındaki kapasite : Capacity at the time of incident Gerekçe : Reason of the cut sure : Length of the cut in hours. kapasiteorani : Proportion of the capacity at the time of the cut to max capacity. Type : Type of the plant
cuts %>%
select(Plant.Type, Plant.Name, Established.Power) %>%
distinct(Plant.Name, Plant.Type, Established.Power) %>%
group_by(Plant.Type) %>%
summarize(Mean=mean(Established.Power), Total=sum(Established.Power)) %>%
ggplot(.)+
geom_bar(aes(x=reorder(Plant.Type, -Mean), y=Mean, fill=Plant.Type), stat="identity")+
geom_text(aes(x=Plant.Type, y=Total/100, label=signif(Total, 2)))+
labs(x="", y="Average Power Output MWe", title="Power Output Based on Plant Type in Turkey", x="Plant Type")+
theme_light()+
scale_fill_brewer(palette="Greens")+
theme(legend.position="none")+
scale_y_continuous(sec.axis=sec_axis(~.*100, name="Total Power Output MWe"))
While the average capacity of Thermal and NGCC plants are much higher than other plant types, Hydroelectricity plants provide majority of the energy demand of Turkey. Geothermal, Wind and Bioenergy Plants have little contribution on Turkey’s energy output. There are no registered Solar Energy Plants.
There are two types of cuts at the overall level. Either due to a malfunction or due to a planned activity such as turnaround maintenances, tests, capacity reductions due to economic reasons or supply demand balance etc. Let’s separate them according to type of cut. We searched for words that imply an planned activity at the reason of cut and assigned “Planned Activity” as variable to a new column. All others are assigned to “Malfunction”.
cuts <- cuts%>% mutate(TypeofCut=ifelse(grepl("(bak.m|[cç]al[ıi][sş]ma|devreye alma|yük alma|test|planl[ıi]|devre di[sş]i)", Reason, ignore.case = T, perl=T), "Planned Activity", "Malfunction"))
Our histogram that shows distribution of the cuts durations looks messy due to outliers. Here is a quick workaround by fixing outliers to a maximum value inspired by https://edwinth.github.io/blog/outlier-bin/. Same person has a nice package to deal with such situations, however its dependencies does not work with Shiny.
It looks like number of cuts are mostly between 0-5 hour with some outliers. Looks like most malfunctions are solved in less than 5 hours.When we look at the capacity usage ratio at the malfunctions, there is a stack between 0-10%. However except that distribution is close to normal.
cuts %>% mutate(sure_outlierfixed = ifelse(sure > 24, 24, sure))%>%
ggplot(aes(x=sure_outlierfixed))+
geom_histogram()+
facet_wrap(~TypeofCut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Note this part has to be polished.
cuts$kapasiteorani <- as.numeric(sub("%","",cuts$kapasiteorani))/100
ggplot(cuts,aes(x=kapasiteorani))+
geom_histogram()+
facet_wrap(~TypeofCut)+
scale_x_continuous(limits = c(0,1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Let’s see how many malfunctions the power plants have reported since late 2014.
#Gather plants that reported more than 1000 malfunctions in the last 6 years
m_count <- cuts %>%
filter(TypeofCut=="Malfunction") %>%
group_by(Plant.Name) %>%
summarize(m_count=n()) %>%
arrange(desc(m_count)) %>%
filter(m_count >= 1000)
m_plants <- as.vector(m_count$Plant.Name)
malf <- cuts %>%
filter(Plant.Name %in% m_plants, TypeofCut=="Malfunction") %>%
mutate(quarter=lubridate::quarter(Start.Date, with_year = T)) %>%
group_by(Plant.Name, quarter) %>%
summarize(malf=n())
malf$quarter=as.character(malf$quarter)
ggplotly(
ggplot(malf, aes(x=quarter))+
coord_flip()+
theme_bw()+
geom_bar(aes(y=malf, fill=Plant.Name), stat="identity")+
theme(legend.position = "bottom", legend.title = element_text("Plant Name"))+
labs(x="Quarter", y="Malfunction Count", title="Quarterly Fault Count of Top Frequently Malfunctioning Plants")
)
When we look at the graph, we can roughly see that thermal plants make up most of the top malfunctioning plants in Turkey. 2018’s 3rd quarter showed a spike in reported malfunctions, from our graph it seems like two major plants had numerous faults during this quarter, eren enerji and sebenoba reporting 502 and 482 cuts, respectively. A power plant is a huge facility that utilizes large numbers of various equipments. A fault may occur on any of them. So, to get a more explanatory outcome, we’ve decided to categorise these cuts by malfunction type based on the reason written.
#Categorise malfunctions
catmalf <- cuts %>%
filter(TypeofCut=="Malfunction") %>%
mutate(Malf.Category =ifelse(grepl("(t.rb[ıi]n|g.?t.|kompr[ae]s[oö]r|pompa|fan|de[gğ]irmen|vibrasyon|makin[ea]|trip|motor|govern[oöe]r|ate[sş\\?]leme|ayar kanat?|hidrolik start|c[uü]ruf)", Reason, ignore.case = T, perl=T), "Rotating Equipment Failure",
(ifelse(grepl("(reg[uü]lat[oö]r|trafo|(elektrik|enerji) kesinti|so[gğ]utma su.?.?|[gj]enerat[oö]r|elektriksel|154|bara|[sş]ebeke|santral)", Reason, ignore.case = T, perl=T), "Electrical or Other Utilities Failure",
(ifelse(grepl("((?<!\\w)su |k[oö\\?]m[üu\\?]r|gaz basin[çc]?)", Reason, ignore.case = T, perl=T), "Feedstock Issues",
(ifelse(grepl("(vana|kazan|boru|hatt[ıi]|e[şs]anjör|val(f|ve)|kablo|air preheater|ka.ak)", Reason, ignore.case = T, perl=T), "Static Equipment Failure",
(ifelse(grepl("(plc|dcs|haberle[şs]me|otom[oa]syon|scada|[ei]nstr[uü]?m.{2,3} (hava|air)|[\\?i]kaz)", Reason, ignore.case = T, perl=T), "Control and Automation Systems Failure",
(ifelse(grepl("(bo[ğg]ulma|çiftçi|ara[çc] dü[şs]|tarim|atmosfer(ik)?|ya[gğ][ıi][şs])", Reason, ignore.case = T, perl=T), "Outside Factors",
(ifelse(grepl("(^(sistem )?ar.za(si|nin)?( devam.)?$|^$|[uü]nite ar[ıi]za(s[iı])?)", Reason, ignore.case = T, perl=T), "Unspecified",
"Other"))))))))))))))
So now there are several different malfunction types from rotating equipments such as pumps, turbines to static equipments like pipelines and heat exchangers. Electrical or other utility based cuts are numerous too, note that electrical failures here are problems in electricity that power plant equipments use, not the electricity they produce, therefore they are counted as utilities. There are a few strange cases too, a hydroelectricity power plant shutting down due to a car falling into the dam lake is one of such cases.
m_by_type<- catmalf %>%
group_by(Plant.Type, Malf.Category) %>%
filter(Plant.Type!="Other")%>%
count() %>%
ungroup()%>%
group_by(Plant.Type)%>%
mutate(perc=`n`/sum(`n`))
#Plot pie charts for most occured malfunction type
plot_ly(textposition = 'inside',
textinfo = 'label+percent',
insidetextfont = list(color = '#FFFFFF'),
marker = list(colors = colors,
line = list(color = '#FFFFFF', width = 1))) %>%
add_pie(data = subset(m_by_type, Plant.Type=="TES"), labels = ~Malf.Category, values = ~n,
name = "Thermal Energy Plant", domain = list(x = c(0, 0.35), y = c(0.50, 0.95))) %>%
add_pie(data = subset(m_by_type, Plant.Type=="HES"), labels = ~Malf.Category, values = ~n,
name = "Hydroelectricity Plant", domain = list(x = c(0.35, 1), y = c(0.50, 0.95))) %>%
add_pie(data = subset(m_by_type, Plant.Type=="RES"), labels = ~Malf.Category, values = ~n,
name = "Wind Energy Plant", domain = list(x = c(0, 0.35), y = c(0, 0.45))) %>%
add_pie(data = subset(m_by_type, Plant.Type=="JES"), labels = ~Malf.Category, values = ~n,
name = "Geothermal Energy Plant", domain = list(x = c(0.35, 1), y = c(0, 0.45))) %>%
layout(title = "Malfunction Type by Plant", showlegend = F,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = TRUE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
annotations = list(
list(x = 0.09 , y = 1.0, text = "Thermal Energy Plant", showarrow = F, xref='paper', yref='paper'),
list(x = 0.8 , y = 1.0, text = "Hydroelectricity Plant", showarrow = F, xref='paper', yref='paper'),
list(x = 0.1 , y = 0.47, text = "Wind Turbine", showarrow = F, xref='paper', yref='paper'),
list(x = 0.8 , y = 0.47, text = "Geothermal Energy Plant", showarrow = F, xref='paper', yref='paper')))
Thermal Plants have suffered from static equipment and rotating equipment failures almost in equal cases, hydroelectricty plants have had many electrical and utility issues. Being mostly comprised of rotating equipments, wind turbines’ majority of problems come from rotating equipments and among the reports of geothermal plants, almost half of them were electrical or other utility problems.
Refs
https://plot.ly/r/bubble-charts/ https://edwinth.github.io/blog/outlier-bin/